# sanitizer.py
"""Deprecated from html5lib 1.1.

See `here <https://github.com/html5lib/html5lib-python/issues/443>`_ for
information about its deprecation; `Bleach <https://github.com/mozilla/bleach>`_
is recommended as a replacement. Please let us know in the aforementioned issue
if Bleach is unsuitable for your needs.

"""
  7. from __future__ import absolute_import, division, unicode_literals
  8. import re
  9. import warnings
  10. from xml.sax.saxutils import escape, unescape
  11. from pip._vendor.six.moves import urllib_parse as urlparse
  12. from . import base
  13. from ..constants import namespaces, prefixes
  14. __all__ = ["Filter"]
  15. _deprecation_msg = (
  16. "html5lib's sanitizer is deprecated; see " +
  17. "https://github.com/html5lib/html5lib-python/issues/443 and please let " +
  18. "us know if Bleach is unsuitable for your needs"
  19. )
  20. warnings.warn(_deprecation_msg, DeprecationWarning)
  21. allowed_elements = frozenset((
  22. (namespaces['html'], 'a'),
  23. (namespaces['html'], 'abbr'),
  24. (namespaces['html'], 'acronym'),
  25. (namespaces['html'], 'address'),
  26. (namespaces['html'], 'area'),
  27. (namespaces['html'], 'article'),
  28. (namespaces['html'], 'aside'),
  29. (namespaces['html'], 'audio'),
  30. (namespaces['html'], 'b'),
  31. (namespaces['html'], 'big'),
  32. (namespaces['html'], 'blockquote'),
  33. (namespaces['html'], 'br'),
  34. (namespaces['html'], 'button'),
  35. (namespaces['html'], 'canvas'),
  36. (namespaces['html'], 'caption'),
  37. (namespaces['html'], 'center'),
  38. (namespaces['html'], 'cite'),
  39. (namespaces['html'], 'code'),
  40. (namespaces['html'], 'col'),
  41. (namespaces['html'], 'colgroup'),
  42. (namespaces['html'], 'command'),
  43. (namespaces['html'], 'datagrid'),
  44. (namespaces['html'], 'datalist'),
  45. (namespaces['html'], 'dd'),
  46. (namespaces['html'], 'del'),
  47. (namespaces['html'], 'details'),
  48. (namespaces['html'], 'dfn'),
  49. (namespaces['html'], 'dialog'),
  50. (namespaces['html'], 'dir'),
  51. (namespaces['html'], 'div'),
  52. (namespaces['html'], 'dl'),
  53. (namespaces['html'], 'dt'),
  54. (namespaces['html'], 'em'),
  55. (namespaces['html'], 'event-source'),
  56. (namespaces['html'], 'fieldset'),
  57. (namespaces['html'], 'figcaption'),
  58. (namespaces['html'], 'figure'),
  59. (namespaces['html'], 'footer'),
  60. (namespaces['html'], 'font'),
  61. (namespaces['html'], 'form'),
  62. (namespaces['html'], 'header'),
  63. (namespaces['html'], 'h1'),
  64. (namespaces['html'], 'h2'),
  65. (namespaces['html'], 'h3'),
  66. (namespaces['html'], 'h4'),
  67. (namespaces['html'], 'h5'),
  68. (namespaces['html'], 'h6'),
  69. (namespaces['html'], 'hr'),
  70. (namespaces['html'], 'i'),
  71. (namespaces['html'], 'img'),
  72. (namespaces['html'], 'input'),
  73. (namespaces['html'], 'ins'),
  74. (namespaces['html'], 'keygen'),
  75. (namespaces['html'], 'kbd'),
  76. (namespaces['html'], 'label'),
  77. (namespaces['html'], 'legend'),
  78. (namespaces['html'], 'li'),
  79. (namespaces['html'], 'm'),
  80. (namespaces['html'], 'map'),
  81. (namespaces['html'], 'menu'),
  82. (namespaces['html'], 'meter'),
  83. (namespaces['html'], 'multicol'),
  84. (namespaces['html'], 'nav'),
  85. (namespaces['html'], 'nextid'),
  86. (namespaces['html'], 'ol'),
  87. (namespaces['html'], 'output'),
  88. (namespaces['html'], 'optgroup'),
  89. (namespaces['html'], 'option'),
  90. (namespaces['html'], 'p'),
  91. (namespaces['html'], 'pre'),
  92. (namespaces['html'], 'progress'),
  93. (namespaces['html'], 'q'),
  94. (namespaces['html'], 's'),
  95. (namespaces['html'], 'samp'),
  96. (namespaces['html'], 'section'),
  97. (namespaces['html'], 'select'),
  98. (namespaces['html'], 'small'),
  99. (namespaces['html'], 'sound'),
  100. (namespaces['html'], 'source'),
  101. (namespaces['html'], 'spacer'),
  102. (namespaces['html'], 'span'),
  103. (namespaces['html'], 'strike'),
  104. (namespaces['html'], 'strong'),
  105. (namespaces['html'], 'sub'),
  106. (namespaces['html'], 'sup'),
  107. (namespaces['html'], 'table'),
  108. (namespaces['html'], 'tbody'),
  109. (namespaces['html'], 'td'),
  110. (namespaces['html'], 'textarea'),
  111. (namespaces['html'], 'time'),
  112. (namespaces['html'], 'tfoot'),
  113. (namespaces['html'], 'th'),
  114. (namespaces['html'], 'thead'),
  115. (namespaces['html'], 'tr'),
  116. (namespaces['html'], 'tt'),
  117. (namespaces['html'], 'u'),
  118. (namespaces['html'], 'ul'),
  119. (namespaces['html'], 'var'),
  120. (namespaces['html'], 'video'),
  121. (namespaces['mathml'], 'maction'),
  122. (namespaces['mathml'], 'math'),
  123. (namespaces['mathml'], 'merror'),
  124. (namespaces['mathml'], 'mfrac'),
  125. (namespaces['mathml'], 'mi'),
  126. (namespaces['mathml'], 'mmultiscripts'),
  127. (namespaces['mathml'], 'mn'),
  128. (namespaces['mathml'], 'mo'),
  129. (namespaces['mathml'], 'mover'),
  130. (namespaces['mathml'], 'mpadded'),
  131. (namespaces['mathml'], 'mphantom'),
  132. (namespaces['mathml'], 'mprescripts'),
  133. (namespaces['mathml'], 'mroot'),
  134. (namespaces['mathml'], 'mrow'),
  135. (namespaces['mathml'], 'mspace'),
  136. (namespaces['mathml'], 'msqrt'),
  137. (namespaces['mathml'], 'mstyle'),
  138. (namespaces['mathml'], 'msub'),
  139. (namespaces['mathml'], 'msubsup'),
  140. (namespaces['mathml'], 'msup'),
  141. (namespaces['mathml'], 'mtable'),
  142. (namespaces['mathml'], 'mtd'),
  143. (namespaces['mathml'], 'mtext'),
  144. (namespaces['mathml'], 'mtr'),
  145. (namespaces['mathml'], 'munder'),
  146. (namespaces['mathml'], 'munderover'),
  147. (namespaces['mathml'], 'none'),
  148. (namespaces['svg'], 'a'),
  149. (namespaces['svg'], 'animate'),
  150. (namespaces['svg'], 'animateColor'),
  151. (namespaces['svg'], 'animateMotion'),
  152. (namespaces['svg'], 'animateTransform'),
  153. (namespaces['svg'], 'clipPath'),
  154. (namespaces['svg'], 'circle'),
  155. (namespaces['svg'], 'defs'),
  156. (namespaces['svg'], 'desc'),
  157. (namespaces['svg'], 'ellipse'),
  158. (namespaces['svg'], 'font-face'),
  159. (namespaces['svg'], 'font-face-name'),
  160. (namespaces['svg'], 'font-face-src'),
  161. (namespaces['svg'], 'g'),
  162. (namespaces['svg'], 'glyph'),
  163. (namespaces['svg'], 'hkern'),
  164. (namespaces['svg'], 'linearGradient'),
  165. (namespaces['svg'], 'line'),
  166. (namespaces['svg'], 'marker'),
  167. (namespaces['svg'], 'metadata'),
  168. (namespaces['svg'], 'missing-glyph'),
  169. (namespaces['svg'], 'mpath'),
  170. (namespaces['svg'], 'path'),
  171. (namespaces['svg'], 'polygon'),
  172. (namespaces['svg'], 'polyline'),
  173. (namespaces['svg'], 'radialGradient'),
  174. (namespaces['svg'], 'rect'),
  175. (namespaces['svg'], 'set'),
  176. (namespaces['svg'], 'stop'),
  177. (namespaces['svg'], 'svg'),
  178. (namespaces['svg'], 'switch'),
  179. (namespaces['svg'], 'text'),
  180. (namespaces['svg'], 'title'),
  181. (namespaces['svg'], 'tspan'),
  182. (namespaces['svg'], 'use'),
  183. ))
  184. allowed_attributes = frozenset((
  185. # HTML attributes
  186. (None, 'abbr'),
  187. (None, 'accept'),
  188. (None, 'accept-charset'),
  189. (None, 'accesskey'),
  190. (None, 'action'),
  191. (None, 'align'),
  192. (None, 'alt'),
  193. (None, 'autocomplete'),
  194. (None, 'autofocus'),
  195. (None, 'axis'),
  196. (None, 'background'),
  197. (None, 'balance'),
  198. (None, 'bgcolor'),
  199. (None, 'bgproperties'),
  200. (None, 'border'),
  201. (None, 'bordercolor'),
  202. (None, 'bordercolordark'),
  203. (None, 'bordercolorlight'),
  204. (None, 'bottompadding'),
  205. (None, 'cellpadding'),
  206. (None, 'cellspacing'),
  207. (None, 'ch'),
  208. (None, 'challenge'),
  209. (None, 'char'),
  210. (None, 'charoff'),
  211. (None, 'choff'),
  212. (None, 'charset'),
  213. (None, 'checked'),
  214. (None, 'cite'),
  215. (None, 'class'),
  216. (None, 'clear'),
  217. (None, 'color'),
  218. (None, 'cols'),
  219. (None, 'colspan'),
  220. (None, 'compact'),
  221. (None, 'contenteditable'),
  222. (None, 'controls'),
  223. (None, 'coords'),
  224. (None, 'data'),
  225. (None, 'datafld'),
  226. (None, 'datapagesize'),
  227. (None, 'datasrc'),
  228. (None, 'datetime'),
  229. (None, 'default'),
  230. (None, 'delay'),
  231. (None, 'dir'),
  232. (None, 'disabled'),
  233. (None, 'draggable'),
  234. (None, 'dynsrc'),
  235. (None, 'enctype'),
  236. (None, 'end'),
  237. (None, 'face'),
  238. (None, 'for'),
  239. (None, 'form'),
  240. (None, 'frame'),
  241. (None, 'galleryimg'),
  242. (None, 'gutter'),
  243. (None, 'headers'),
  244. (None, 'height'),
  245. (None, 'hidefocus'),
  246. (None, 'hidden'),
  247. (None, 'high'),
  248. (None, 'href'),
  249. (None, 'hreflang'),
  250. (None, 'hspace'),
  251. (None, 'icon'),
  252. (None, 'id'),
  253. (None, 'inputmode'),
  254. (None, 'ismap'),
  255. (None, 'keytype'),
  256. (None, 'label'),
  257. (None, 'leftspacing'),
  258. (None, 'lang'),
  259. (None, 'list'),
  260. (None, 'longdesc'),
  261. (None, 'loop'),
  262. (None, 'loopcount'),
  263. (None, 'loopend'),
  264. (None, 'loopstart'),
  265. (None, 'low'),
  266. (None, 'lowsrc'),
  267. (None, 'max'),
  268. (None, 'maxlength'),
  269. (None, 'media'),
  270. (None, 'method'),
  271. (None, 'min'),
  272. (None, 'multiple'),
  273. (None, 'name'),
  274. (None, 'nohref'),
  275. (None, 'noshade'),
  276. (None, 'nowrap'),
  277. (None, 'open'),
  278. (None, 'optimum'),
  279. (None, 'pattern'),
  280. (None, 'ping'),
  281. (None, 'point-size'),
  282. (None, 'poster'),
  283. (None, 'pqg'),
  284. (None, 'preload'),
  285. (None, 'prompt'),
  286. (None, 'radiogroup'),
  287. (None, 'readonly'),
  288. (None, 'rel'),
  289. (None, 'repeat-max'),
  290. (None, 'repeat-min'),
  291. (None, 'replace'),
  292. (None, 'required'),
  293. (None, 'rev'),
  294. (None, 'rightspacing'),
  295. (None, 'rows'),
  296. (None, 'rowspan'),
  297. (None, 'rules'),
  298. (None, 'scope'),
  299. (None, 'selected'),
  300. (None, 'shape'),
  301. (None, 'size'),
  302. (None, 'span'),
  303. (None, 'src'),
  304. (None, 'start'),
  305. (None, 'step'),
  306. (None, 'style'),
  307. (None, 'summary'),
  308. (None, 'suppress'),
  309. (None, 'tabindex'),
  310. (None, 'target'),
  311. (None, 'template'),
  312. (None, 'title'),
  313. (None, 'toppadding'),
  314. (None, 'type'),
  315. (None, 'unselectable'),
  316. (None, 'usemap'),
  317. (None, 'urn'),
  318. (None, 'valign'),
  319. (None, 'value'),
  320. (None, 'variable'),
  321. (None, 'volume'),
  322. (None, 'vspace'),
  323. (None, 'vrml'),
  324. (None, 'width'),
  325. (None, 'wrap'),
  326. (namespaces['xml'], 'lang'),
  327. # MathML attributes
  328. (None, 'actiontype'),
  329. (None, 'align'),
  330. (None, 'columnalign'),
  331. (None, 'columnalign'),
  332. (None, 'columnalign'),
  333. (None, 'columnlines'),
  334. (None, 'columnspacing'),
  335. (None, 'columnspan'),
  336. (None, 'depth'),
  337. (None, 'display'),
  338. (None, 'displaystyle'),
  339. (None, 'equalcolumns'),
  340. (None, 'equalrows'),
  341. (None, 'fence'),
  342. (None, 'fontstyle'),
  343. (None, 'fontweight'),
  344. (None, 'frame'),
  345. (None, 'height'),
  346. (None, 'linethickness'),
  347. (None, 'lspace'),
  348. (None, 'mathbackground'),
  349. (None, 'mathcolor'),
  350. (None, 'mathvariant'),
  351. (None, 'mathvariant'),
  352. (None, 'maxsize'),
  353. (None, 'minsize'),
  354. (None, 'other'),
  355. (None, 'rowalign'),
  356. (None, 'rowalign'),
  357. (None, 'rowalign'),
  358. (None, 'rowlines'),
  359. (None, 'rowspacing'),
  360. (None, 'rowspan'),
  361. (None, 'rspace'),
  362. (None, 'scriptlevel'),
  363. (None, 'selection'),
  364. (None, 'separator'),
  365. (None, 'stretchy'),
  366. (None, 'width'),
  367. (None, 'width'),
  368. (namespaces['xlink'], 'href'),
  369. (namespaces['xlink'], 'show'),
  370. (namespaces['xlink'], 'type'),
  371. # SVG attributes
  372. (None, 'accent-height'),
  373. (None, 'accumulate'),
  374. (None, 'additive'),
  375. (None, 'alphabetic'),
  376. (None, 'arabic-form'),
  377. (None, 'ascent'),
  378. (None, 'attributeName'),
  379. (None, 'attributeType'),
  380. (None, 'baseProfile'),
  381. (None, 'bbox'),
  382. (None, 'begin'),
  383. (None, 'by'),
  384. (None, 'calcMode'),
  385. (None, 'cap-height'),
  386. (None, 'class'),
  387. (None, 'clip-path'),
  388. (None, 'color'),
  389. (None, 'color-rendering'),
  390. (None, 'content'),
  391. (None, 'cx'),
  392. (None, 'cy'),
  393. (None, 'd'),
  394. (None, 'dx'),
  395. (None, 'dy'),
  396. (None, 'descent'),
  397. (None, 'display'),
  398. (None, 'dur'),
  399. (None, 'end'),
  400. (None, 'fill'),
  401. (None, 'fill-opacity'),
  402. (None, 'fill-rule'),
  403. (None, 'font-family'),
  404. (None, 'font-size'),
  405. (None, 'font-stretch'),
  406. (None, 'font-style'),
  407. (None, 'font-variant'),
  408. (None, 'font-weight'),
  409. (None, 'from'),
  410. (None, 'fx'),
  411. (None, 'fy'),
  412. (None, 'g1'),
  413. (None, 'g2'),
  414. (None, 'glyph-name'),
  415. (None, 'gradientUnits'),
  416. (None, 'hanging'),
  417. (None, 'height'),
  418. (None, 'horiz-adv-x'),
  419. (None, 'horiz-origin-x'),
  420. (None, 'id'),
  421. (None, 'ideographic'),
  422. (None, 'k'),
  423. (None, 'keyPoints'),
  424. (None, 'keySplines'),
  425. (None, 'keyTimes'),
  426. (None, 'lang'),
  427. (None, 'marker-end'),
  428. (None, 'marker-mid'),
  429. (None, 'marker-start'),
  430. (None, 'markerHeight'),
  431. (None, 'markerUnits'),
  432. (None, 'markerWidth'),
  433. (None, 'mathematical'),
  434. (None, 'max'),
  435. (None, 'min'),
  436. (None, 'name'),
  437. (None, 'offset'),
  438. (None, 'opacity'),
  439. (None, 'orient'),
  440. (None, 'origin'),
  441. (None, 'overline-position'),
  442. (None, 'overline-thickness'),
  443. (None, 'panose-1'),
  444. (None, 'path'),
  445. (None, 'pathLength'),
  446. (None, 'points'),
  447. (None, 'preserveAspectRatio'),
  448. (None, 'r'),
  449. (None, 'refX'),
  450. (None, 'refY'),
  451. (None, 'repeatCount'),
  452. (None, 'repeatDur'),
  453. (None, 'requiredExtensions'),
  454. (None, 'requiredFeatures'),
  455. (None, 'restart'),
  456. (None, 'rotate'),
  457. (None, 'rx'),
  458. (None, 'ry'),
  459. (None, 'slope'),
  460. (None, 'stemh'),
  461. (None, 'stemv'),
  462. (None, 'stop-color'),
  463. (None, 'stop-opacity'),
  464. (None, 'strikethrough-position'),
  465. (None, 'strikethrough-thickness'),
  466. (None, 'stroke'),
  467. (None, 'stroke-dasharray'),
  468. (None, 'stroke-dashoffset'),
  469. (None, 'stroke-linecap'),
  470. (None, 'stroke-linejoin'),
  471. (None, 'stroke-miterlimit'),
  472. (None, 'stroke-opacity'),
  473. (None, 'stroke-width'),
  474. (None, 'systemLanguage'),
  475. (None, 'target'),
  476. (None, 'text-anchor'),
  477. (None, 'to'),
  478. (None, 'transform'),
  479. (None, 'type'),
  480. (None, 'u1'),
  481. (None, 'u2'),
  482. (None, 'underline-position'),
  483. (None, 'underline-thickness'),
  484. (None, 'unicode'),
  485. (None, 'unicode-range'),
  486. (None, 'units-per-em'),
  487. (None, 'values'),
  488. (None, 'version'),
  489. (None, 'viewBox'),
  490. (None, 'visibility'),
  491. (None, 'width'),
  492. (None, 'widths'),
  493. (None, 'x'),
  494. (None, 'x-height'),
  495. (None, 'x1'),
  496. (None, 'x2'),
  497. (namespaces['xlink'], 'actuate'),
  498. (namespaces['xlink'], 'arcrole'),
  499. (namespaces['xlink'], 'href'),
  500. (namespaces['xlink'], 'role'),
  501. (namespaces['xlink'], 'show'),
  502. (namespaces['xlink'], 'title'),
  503. (namespaces['xlink'], 'type'),
  504. (namespaces['xml'], 'base'),
  505. (namespaces['xml'], 'lang'),
  506. (namespaces['xml'], 'space'),
  507. (None, 'y'),
  508. (None, 'y1'),
  509. (None, 'y2'),
  510. (None, 'zoomAndPan'),
  511. ))
  512. attr_val_is_uri = frozenset((
  513. (None, 'href'),
  514. (None, 'src'),
  515. (None, 'cite'),
  516. (None, 'action'),
  517. (None, 'longdesc'),
  518. (None, 'poster'),
  519. (None, 'background'),
  520. (None, 'datasrc'),
  521. (None, 'dynsrc'),
  522. (None, 'lowsrc'),
  523. (None, 'ping'),
  524. (namespaces['xlink'], 'href'),
  525. (namespaces['xml'], 'base'),
  526. ))
  527. svg_attr_val_allows_ref = frozenset((
  528. (None, 'clip-path'),
  529. (None, 'color-profile'),
  530. (None, 'cursor'),
  531. (None, 'fill'),
  532. (None, 'filter'),
  533. (None, 'marker'),
  534. (None, 'marker-start'),
  535. (None, 'marker-mid'),
  536. (None, 'marker-end'),
  537. (None, 'mask'),
  538. (None, 'stroke'),
  539. ))
  540. svg_allow_local_href = frozenset((
  541. (None, 'altGlyph'),
  542. (None, 'animate'),
  543. (None, 'animateColor'),
  544. (None, 'animateMotion'),
  545. (None, 'animateTransform'),
  546. (None, 'cursor'),
  547. (None, 'feImage'),
  548. (None, 'filter'),
  549. (None, 'linearGradient'),
  550. (None, 'pattern'),
  551. (None, 'radialGradient'),
  552. (None, 'textpath'),
  553. (None, 'tref'),
  554. (None, 'set'),
  555. (None, 'use')
  556. ))
  557. allowed_css_properties = frozenset((
  558. 'azimuth',
  559. 'background-color',
  560. 'border-bottom-color',
  561. 'border-collapse',
  562. 'border-color',
  563. 'border-left-color',
  564. 'border-right-color',
  565. 'border-top-color',
  566. 'clear',
  567. 'color',
  568. 'cursor',
  569. 'direction',
  570. 'display',
  571. 'elevation',
  572. 'float',
  573. 'font',
  574. 'font-family',
  575. 'font-size',
  576. 'font-style',
  577. 'font-variant',
  578. 'font-weight',
  579. 'height',
  580. 'letter-spacing',
  581. 'line-height',
  582. 'overflow',
  583. 'pause',
  584. 'pause-after',
  585. 'pause-before',
  586. 'pitch',
  587. 'pitch-range',
  588. 'richness',
  589. 'speak',
  590. 'speak-header',
  591. 'speak-numeral',
  592. 'speak-punctuation',
  593. 'speech-rate',
  594. 'stress',
  595. 'text-align',
  596. 'text-decoration',
  597. 'text-indent',
  598. 'unicode-bidi',
  599. 'vertical-align',
  600. 'voice-family',
  601. 'volume',
  602. 'white-space',
  603. 'width',
  604. ))
  605. allowed_css_keywords = frozenset((
  606. 'auto',
  607. 'aqua',
  608. 'black',
  609. 'block',
  610. 'blue',
  611. 'bold',
  612. 'both',
  613. 'bottom',
  614. 'brown',
  615. 'center',
  616. 'collapse',
  617. 'dashed',
  618. 'dotted',
  619. 'fuchsia',
  620. 'gray',
  621. 'green',
  622. '!important',
  623. 'italic',
  624. 'left',
  625. 'lime',
  626. 'maroon',
  627. 'medium',
  628. 'none',
  629. 'navy',
  630. 'normal',
  631. 'nowrap',
  632. 'olive',
  633. 'pointer',
  634. 'purple',
  635. 'red',
  636. 'right',
  637. 'solid',
  638. 'silver',
  639. 'teal',
  640. 'top',
  641. 'transparent',
  642. 'underline',
  643. 'white',
  644. 'yellow',
  645. ))
  646. allowed_svg_properties = frozenset((
  647. 'fill',
  648. 'fill-opacity',
  649. 'fill-rule',
  650. 'stroke',
  651. 'stroke-width',
  652. 'stroke-linecap',
  653. 'stroke-linejoin',
  654. 'stroke-opacity',
  655. ))
  656. allowed_protocols = frozenset((
  657. 'ed2k',
  658. 'ftp',
  659. 'http',
  660. 'https',
  661. 'irc',
  662. 'mailto',
  663. 'news',
  664. 'gopher',
  665. 'nntp',
  666. 'telnet',
  667. 'webcal',
  668. 'xmpp',
  669. 'callto',
  670. 'feed',
  671. 'urn',
  672. 'aim',
  673. 'rsync',
  674. 'tag',
  675. 'ssh',
  676. 'sftp',
  677. 'rtsp',
  678. 'afs',
  679. 'data',
  680. ))
  681. allowed_content_types = frozenset((
  682. 'image/png',
  683. 'image/jpeg',
  684. 'image/gif',
  685. 'image/webp',
  686. 'image/bmp',
  687. 'text/plain',
  688. ))
  689. data_content_type = re.compile(r'''
  690. ^
  691. # Match a content type <application>/<type>
  692. (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
  693. # Match any character set and encoding
  694. (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
  695. |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
  696. # Assume the rest is data
  697. ,.*
  698. $
  699. ''',
  700. re.VERBOSE)
  701. class Filter(base.Filter):
  702. """Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes"""
  703. def __init__(self,
  704. source,
  705. allowed_elements=allowed_elements,
  706. allowed_attributes=allowed_attributes,
  707. allowed_css_properties=allowed_css_properties,
  708. allowed_css_keywords=allowed_css_keywords,
  709. allowed_svg_properties=allowed_svg_properties,
  710. allowed_protocols=allowed_protocols,
  711. allowed_content_types=allowed_content_types,
  712. attr_val_is_uri=attr_val_is_uri,
  713. svg_attr_val_allows_ref=svg_attr_val_allows_ref,
  714. svg_allow_local_href=svg_allow_local_href):
  715. """Creates a Filter
  716. :arg allowed_elements: set of elements to allow--everything else will
  717. be escaped
  718. :arg allowed_attributes: set of attributes to allow in
  719. elements--everything else will be stripped
  720. :arg allowed_css_properties: set of CSS properties to allow--everything
  721. else will be stripped
  722. :arg allowed_css_keywords: set of CSS keywords to allow--everything
  723. else will be stripped
  724. :arg allowed_svg_properties: set of SVG properties to allow--everything
  725. else will be removed
  726. :arg allowed_protocols: set of allowed protocols for URIs
  727. :arg allowed_content_types: set of allowed content types for ``data`` URIs.
  728. :arg attr_val_is_uri: set of attributes that have URI values--values
  729. that have a scheme not listed in ``allowed_protocols`` are removed
  730. :arg svg_attr_val_allows_ref: set of SVG attributes that can have
  731. references
  732. :arg svg_allow_local_href: set of SVG elements that can have local
  733. hrefs--these are removed
  734. """
  735. super(Filter, self).__init__(source)
  736. warnings.warn(_deprecation_msg, DeprecationWarning)
  737. self.allowed_elements = allowed_elements
  738. self.allowed_attributes = allowed_attributes
  739. self.allowed_css_properties = allowed_css_properties
  740. self.allowed_css_keywords = allowed_css_keywords
  741. self.allowed_svg_properties = allowed_svg_properties
  742. self.allowed_protocols = allowed_protocols
  743. self.allowed_content_types = allowed_content_types
  744. self.attr_val_is_uri = attr_val_is_uri
  745. self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
  746. self.svg_allow_local_href = svg_allow_local_href
  747. def __iter__(self):
  748. for token in base.Filter.__iter__(self):
  749. token = self.sanitize_token(token)
  750. if token:
  751. yield token
  752. # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
  753. # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
  754. # are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
  755. # ALLOWED_CSS_KEYWORDS, are allowed through. attributes in ATTR_VAL_IS_URI
  756. # are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
  757. # allowed.
  758. #
  759. # sanitize_html('<script> do_nasty_stuff() </script>')
  760. # => &lt;script> do_nasty_stuff() &lt;/script>
  761. # sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
  762. # => <a>Click here for $100</a>
  763. def sanitize_token(self, token):
  764. # accommodate filters which use token_type differently
  765. token_type = token["type"]
  766. if token_type in ("StartTag", "EndTag", "EmptyTag"):
  767. name = token["name"]
  768. namespace = token["namespace"]
  769. if ((namespace, name) in self.allowed_elements or
  770. (namespace is None and
  771. (namespaces["html"], name) in self.allowed_elements)):
  772. return self.allowed_token(token)
  773. else:
  774. return self.disallowed_token(token)
  775. elif token_type == "Comment":
  776. pass
  777. else:
  778. return token
  779. def allowed_token(self, token):
  780. if "data" in token:
  781. attrs = token["data"]
  782. attr_names = set(attrs.keys())
  783. # Remove forbidden attributes
  784. for to_remove in (attr_names - self.allowed_attributes):
  785. del token["data"][to_remove]
  786. attr_names.remove(to_remove)
  787. # Remove attributes with disallowed URL values
  788. for attr in (attr_names & self.attr_val_is_uri):
  789. assert attr in attrs
  790. # I don't have a clue where this regexp comes from or why it matches those
  791. # characters, nor why we call unescape. I just know it's always been here.
  792. # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
  793. # this will do is remove *more* than it otherwise would.
  794. val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
  795. unescape(attrs[attr])).lower()
  796. # remove replacement characters from unescaped characters
  797. val_unescaped = val_unescaped.replace("\ufffd", "")
  798. try:
  799. uri = urlparse.urlparse(val_unescaped)
  800. except ValueError:
  801. uri = None
  802. del attrs[attr]
  803. if uri and uri.scheme:
  804. if uri.scheme not in self.allowed_protocols:
  805. del attrs[attr]
  806. if uri.scheme == 'data':
  807. m = data_content_type.match(uri.path)
  808. if not m:
  809. del attrs[attr]
  810. elif m.group('content_type') not in self.allowed_content_types:
  811. del attrs[attr]
  812. for attr in self.svg_attr_val_allows_ref:
  813. if attr in attrs:
  814. attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
  815. ' ',
  816. unescape(attrs[attr]))
  817. if (token["name"] in self.svg_allow_local_href and
  818. (namespaces['xlink'], 'href') in attrs and re.search(r'^\s*[^#\s].*',
  819. attrs[(namespaces['xlink'], 'href')])):
  820. del attrs[(namespaces['xlink'], 'href')]
  821. if (None, 'style') in attrs:
  822. attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
  823. token["data"] = attrs
  824. return token
  825. def disallowed_token(self, token):
  826. token_type = token["type"]
  827. if token_type == "EndTag":
  828. token["data"] = "</%s>" % token["name"]
  829. elif token["data"]:
  830. assert token_type in ("StartTag", "EmptyTag")
  831. attrs = []
  832. for (ns, name), v in token["data"].items():
  833. attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
  834. token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
  835. else:
  836. token["data"] = "<%s>" % token["name"]
  837. if token.get("selfClosing"):
  838. token["data"] = token["data"][:-1] + "/>"
  839. token["type"] = "Characters"
  840. del token["name"]
  841. return token
  842. def sanitize_css(self, style):
  843. # disallow urls
  844. style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
  845. # gauntlet
  846. if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
  847. return ''
  848. if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
  849. return ''
  850. clean = []
  851. for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
  852. if not value:
  853. continue
  854. if prop.lower() in self.allowed_css_properties:
  855. clean.append(prop + ': ' + value + ';')
  856. elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
  857. 'padding']:
  858. for keyword in value.split():
  859. if keyword not in self.allowed_css_keywords and \
  860. not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa
  861. break
  862. else:
  863. clean.append(prop + ': ' + value + ';')
  864. elif prop.lower() in self.allowed_svg_properties:
  865. clean.append(prop + ': ' + value + ';')
  866. return ' '.join(clean)